Load Packages
library("phyloseq")
library("ggplot2")
library("tidyr")
library("RColorBrewer")
library(reshape2)
library("vegan")
library(qiime2R)
library(DESeq2)## Warning: package 'GenomicRanges' was built under R version 3.5.1
## Warning: package 'DelayedArray' was built under R version 3.5.1
set colors
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
P <- brewer.pal(12, "Paired")
S2 <- brewer.pal(8, "Set2")
S1 <- brewer.pal(8, "Set1")
D2 <- brewer.pal(8, "Dark2")
colors<- c(P, S2, S1,D2)
colors <- rep(colors, 5)phyloseq<-qza_to_phyloseq(features="table.qza", tree = "rooted-tree.qza")## Warning in read_qza(features): Artifact was not generated with Qiime2
## 2018.4, if data is not successfully imported, please report here
## github.com/jbisanz/qiime2R/issues
## Warning in read_qza(tree): Artifact was not generated with Qiime2 2018.4,
## if data is not successfully imported, please report here github.com/
## jbisanz/qiime2R/issues
load metadata:
metatable <- read.table("SampleINFO.txt", header = TRUE)
row.names(metatable) <- metatable[[1]]
metatable<- metatable[,(-1)]
META<- sample_data(metatable)load taxonomy:
taxonomy <- read.csv("taxonomy.csv", stringsAsFactors = FALSE, header = TRUE)
names(taxonomy) <- c("row", "tax", "Confidence")
row.names(taxonomy) <-taxonomy[[1]]
taxonomy <- taxonomy[,(-1)]
taxonomy <- separate(taxonomy, tax, c("D0","D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14"), sep = ";", fill = "right")
taxonomy <- taxonomy[,c(1:7)]
taxmat <- as.matrix(taxonomy)
TAX = tax_table(taxmat)add taxonomy to phyloseq object:
ps = merge_phyloseq(phyloseq, TAX, META)subset phyloseq object to include F0 and F1 No Sugar Diet and Control Diet
psG <-subset_samples(ps,SampleType == 'Gut' & Generation %in% c('F0', 'F1') & AB== "A" & Treatment %in% c("NSD", "CD"))Transform count data to relative abundance (as %) to normalize for differences in library size.
psGra<- transform_sample_counts(psG, function(OTU) 100* OTU/sum(OTU))Plot
taxabarplot<-plot_bar(psGra, fill = "D1") + scale_y_continuous(expand = c(0, 0)) + ggtitle("") + scale_fill_manual(values=colors) + theme(legend.title=element_blank()) + geom_bar(aes( fill=D3), stat="identity", position="stack")
taxabarplot + theme(legend.position="none")prevdf = apply(X = otu_table(psG),
MARGIN = ifelse(taxa_are_rows(psG), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
prevdf = data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(psG),
tax_table(psG))
#plyr::ddply(prevdf, "D2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})prevalence plot:
prevplot1<-ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps),color=D2)) +
geom_hline(yintercept = 0.025, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
theme_bw()+
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap(~D1) + theme(legend.position="none")
prevplot1## Warning: Transformation introduced infinite values in continuous x-axis
Filter at 2.5% prevalence
prevalenceThreshold = 0.025 * nsamples(psG)
keepTaxa = rownames(prevdf)[(prevdf$Prevalence >= prevalenceThreshold)]
ps2 = prune_taxa(keepTaxa, psG)
#table(tax_table(ps2)[, "D2"], exclude = NULL)prevdf = apply(X = otu_table(ps2),
MARGIN = ifelse(taxa_are_rows(ps2), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
prevdf = data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(ps2),
tax_table(ps2))
#plyr::ddply(prevdf, "D2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
prevplot1<-ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps),color=D2)) +
geom_hline(yintercept = 0.01, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
theme_bw()+
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap(~D1) + theme(legend.position="none")
prevplot1 ### taxonomic
ps0 <- subset_taxa(ps2, !is.na(D1) & !D1 %in% c("", "uncharacterized", "p__"))Transform count data to relative abundance (as %) to normalize for differences in library size.
ps0ra<- transform_sample_counts(ps0, function(OTU) 100* OTU/sum(OTU))Plot
taxabarplot<-plot_bar(ps0ra, fill = "D1") + scale_y_continuous(expand = c(0, 0)) + ggtitle("") + scale_fill_manual(values=colors) + theme(legend.title=element_blank()) + geom_bar(aes( fill=D3), stat="identity", position="stack")
taxabarplot + theme(legend.position="none")Determine the Bray-Curtis distances between samples and perform Principal Coordinate Analysis (PCoA). Plot the PCoA.
ordu = ordinate(ps0ra, "PCoA", "bray")
pbc<-plot_ordination(ps0ra, ordu, color="Treatment", shape = "Generation")+theme_bw() +scale_color_manual(values=P)+ geom_point(size=3)
pbcordu2 = ordinate(ps0ra, "PCoA", "unifrac", weighted=TRUE)
pwu<-plot_ordination(ps0ra, ordu2, color="Treatment", shape = "Generation")+theme_bw() +scale_color_manual(values=P)+ geom_point(size=3)
pwuordu3 = ordinate(ps0ra, "PCoA", "unifrac", weighted=FALSE)
puu<-plot_ordination(ps0ra, ordu3, color="Treatment", shape = "Generation")+theme_bw() +scale_color_manual(values=P)+ geom_point(size=3)
puuF0 <- subset_samples(ps0ra, Generation =="F0")set.seed(1)
OTUs <- t(data.frame(otu_table(F0))) #get data frame of OTUs from phyloseq object
meta <- subset(metatable, SampleType == 'Gut' & Generation == 'F0' & Treatment %in% c("NSD", "CD") & AB == "A")
adonis(vegdist(OTUs, method = "bray") ~ Treatment, data = meta)##
## Call:
## adonis(formula = vegdist(OTUs, method = "bray") ~ Treatment, data = meta)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Treatment 1 0.51751 0.51751 11.411 0.44905 0.001 ***
## Residuals 14 0.63495 0.04535 0.55095
## Total 15 1.15246 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
F1 <- subset_samples(ps0ra, Generation =="F1")set.seed(1)
OTUs <- t(data.frame(otu_table(F1))) #get data frame of OTUs from phyloseq object
meta <- subset(metatable, SampleType == 'Gut' & Generation == 'F1' & Treatment %in% c("NSD", "CD"))
adonis(vegdist(OTUs, method = "bray") ~ Treatment, data = meta)##
## Call:
## adonis(formula = vegdist(OTUs, method = "bray") ~ Treatment, data = meta)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Treatment 1 0.1626 0.162603 1.8789 0.06288 0.119
## Residuals 28 2.4232 0.086542 0.93712
## Total 29 2.5858 1.00000
F1counts <- subset_samples(ps0, Generation == "F1")
ddse <- phyloseq_to_deseq2(F1counts, ~Treatment)## converting counts to integer mode
ddse2 <- DESeq(ddse, test="Wald", fitType="parametric")## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 171 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res<- results(ddse2, cooksCutoff = FALSE, contrast=c("Treatment", "NSD", "CD"))
alpha = 0.05
sigtab = res[which(res$padj < alpha), ]
head(sigtab)## log2 fold change (MLE): Treatment NSD vs CD
## Wald test p-value: Treatment NSD vs CD
## DataFrame with 1 row and 6 columns
## baseMean log2FoldChange
## <numeric> <numeric>
## 5a03c3cd332d6ab4c481eb319db5f239 6.63313637049478 21.8036526377338
## lfcSE stat
## <numeric> <numeric>
## 5a03c3cd332d6ab4c481eb319db5f239 2.93498708238701 7.42887516220378
## pvalue padj
## <numeric> <numeric>
## 5a03c3cd332d6ab4c481eb319db5f239 1.09525051861377e-13 2.70526878097601e-11
dim(sigtab)## [1] 1 6
Only one SV is significantly differentially abundant …
Taxonomy?
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(F1counts)[rownames(sigtab), ], "matrix"))
sigtab## baseMean log2FoldChange lfcSE stat
## 5a03c3cd332d6ab4c481eb319db5f239 6.633136 21.80365 2.934987 7.428875
## pvalue padj D0
## 5a03c3cd332d6ab4c481eb319db5f239 1.095251e-13 2.705269e-11 k__Bacteria
## D1
## 5a03c3cd332d6ab4c481eb319db5f239 p__Proteobacteria
## D2
## 5a03c3cd332d6ab4c481eb319db5f239 c__Gammaproteobacteria
## D3 D4
## 5a03c3cd332d6ab4c481eb319db5f239 o__Pseudomonadales f__Pseudomonadaceae
## D5 D6
## 5a03c3cd332d6ab4c481eb319db5f239 g__Pseudomonas <NA>
Pseudomonas (same as last round of analysis, I think?)
subset phyloseq object to include F0 and F1 No Sugar Diet and Control Diet
psM <-subset_samples(ps,SampleType == 'Media' & Generation %in% c('F0', 'F1') & AB== "A" & Treatment %in% c("NSD", "CD"))Transform count data to relative abundance (as %) to normalize for differences in library size.
psMra<- transform_sample_counts(psM, function(OTU) 100* OTU/sum(OTU))Plot
taxabarplot<-plot_bar(psMra, fill = "D3") + scale_y_continuous(expand = c(0, 0)) + ggtitle("") + scale_fill_manual(values=colors) + theme(legend.title=element_blank()) + geom_bar(aes( fill=D3), stat="identity", position="stack")
taxabarplot + theme(legend.position="none")prevdf = apply(X = otu_table(psM),
MARGIN = ifelse(taxa_are_rows(psM), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
prevdf = data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(psM),
tax_table(psM))
#plyr::ddply(prevdf, "D2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})prevalence plot:
prevplot1<-ggplot(prevdf, aes(TotalAbundance, Prevalence / nsamples(ps),color=D2)) +
geom_hline(yintercept = 0.025, alpha = 0.5, linetype = 2) + geom_point(size = 2, alpha = 0.7) +
theme_bw()+
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap(~D1) + theme(legend.position="none")
prevplot1## Warning: Transformation introduced infinite values in continuous x-axis
No prevalence filtering for media samples (at least for now) - just remove NA at Phylum level
ps0 <- subset_taxa(psM, !is.na(D1) & !D1 %in% c("", "uncharacterized", "p__"))ddse <- phyloseq_to_deseq2(ps0, ~Treatment)## converting counts to integer mode
dds <- ddse[ rowSums(counts(ddse)) > 1, ]
rld <- rlog(dds, blind=TRUE)## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## Warning in sparseTest(counts(object, normalized = TRUE), 0.9, 100, 0.1): the rlog assumes that data is close to a negative binomial distribution, an assumption
## which is sometimes not compatible with datasets where many genes have many zero counts
## despite a few very large counts.
## In this data, for 21.1% of genes with a sum of normalized counts above 100, it was the case
## that a single sample's normalized count made up more than 90% of the sum over all samples.
## the threshold for this warning is 10% of genes. See plotSparsity(dds) for a visualization of this.
## We recommend instead using the varianceStabilizingTransformation or shifted log (see vignette).
df <- as.data.frame(assay(rld))add taxonomy
tab = cbind(df, as(tax_table(ps0)[rownames(df), ], "matrix"))
tab <- tab[1:36]m <- melt(tab, id=c("D0", "D1", "D2", "D3"))plot D1
tabplot<-ggplot(m, aes(x=variable, y=value, color = D1)) + geom_point(size=3, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=P)
tabplot plot D2
tabplot<-ggplot(m, aes(x=variable, y=value, color = D2)) + geom_point(size=3, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=P)
tabplot plot D3
tabplot<-ggplot(m, aes(x=variable, y=value, color = D3)) + geom_point(size=3, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=P)
tabplotGet rid of Outliers and plot again
mNOout <- m[m$value < 30,]plot D1
tabplot<-ggplot(mNOout, aes(x=variable, y=value, color = D1)) + geom_point(size=3, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=P)
tabplotplot D2
tabplot<-ggplot(mNOout, aes(x=variable, y=value, color = D2)) + geom_point(size=3, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=P)
tabplotplot D3
tabplot<-ggplot(mNOout, aes(x=variable, y=value, color = D3)) + geom_point(size=2, alpha = 0.7) + theme(legend.title=element_blank()) + theme_bw() +
ggtitle("log2 normalized counts ") +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))# +scale_color_manual(values=colors)
tabplot